Analysis of New York State’s 2020-2021 probability-based sampling results

This markdown uses the spsurvey package developed by Michael Dumelle, Tom Kincaid, Tony Olsen, and Marc Weber at EPA to analyze the results of New York’s spatially-balanced probability-based survey in 2020-2021. The results include categorical condition extent estimates for statewide conditions, cumulative distribution functions for water quality parameters, and comparisons to the National Lakes Assessment (nationally and regionally), New Hampshire and to New York’s historical data (LMAS samples since 2010).

Load packages

start.time <- Sys.time()

library(tidyverse)
library(huxtable)
library(ggplot2)
library(lubridate)
library(egg)
library(gridExtra)
library(ggpattern)
library(spsurvey) #This code was written for spsurvey v5.0.0

`%!in%` <- Negate(`%in%`)

Load data

To run cat_analysis(), the dataframe will need to include columns for the categorical variable you are interested in, weight for each site, xcoordinate and ycoordinate. Optionally, you can also include a subpopulation or stratum column to separate results and site IDs for two-stage analysis (for example, if there are repeated sites).

In this script, weight is adjusted because of the proportional design. In the future, this can be excluded as the design should include weights already.

I am designating relevant variables from the dataset as well as creating new variables: mean dissolved oxygen from 0-2m, epilimentic means for ph and spconductance, total nitrogen and DIN:TP ratio.

# retrieve raw data from database
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Current")
source("new_database/Reading.LMAS.Data.R")
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")

rm(list=setdiff(ls(), c('newdata',"start.time")))
#list relevant variables
lab<-newdata %>%
  filter(SAMPLE_DATE>'2020-01-01') %>%
  mutate(combined=paste(CHARACTERISTIC_NAME,
                        INFORMATION_TYPE,
                        RSLT_RESULT_SAMPLE_FRACTION,
                        sep = "_"))  %>%
  select(LAKE_HISTORY_ID,
         SAMPLE_DATE,
         combined,
         RSLT_RESULT_VALUE,
         RSLT_LABORATORY_QUALIFIER,
         RSLT_VALIDATOR_QUALIFIER,
         RSLT_PROFILE_DEPTH) %>%
  mutate(RSLT_RESULT_VALUE=ifelse(!is.na(RSLT_LABORATORY_QUALIFIER)&(RSLT_LABORATORY_QUALIFIER=="U"|RSLT_LABORATORY_QUALIFIER=="UE"),"0",RSLT_RESULT_VALUE),
         RSLT_RESULT_VALUE=as.numeric(RSLT_RESULT_VALUE)) %>%
  filter(!is.na(RSLT_RESULT_VALUE),
         is.na(RSLT_VALIDATOR_QUALIFIER)|(RSLT_VALIDATOR_QUALIFIER!="R"),
         combined %in% c('CHLOROPHYLL A_OW_TOTAL',
                         'PHOSPHORUS, TOTAL_OW_TOTAL',
                         "MICROCYSTIN_OW_NA",
                         "NITROGEN, NITRATE (AS N)_OW_TOTAL",
                         "NITROGEN, NITRATE-NITRITE_OW_TOTAL",
                         "NITROGEN, KJELDAHL, TOTAL_OW_TOTAL",
                         "NITROGEN, TOTAL_OW_TOTAL",
                         "DISSOLVED OXYGEN_DP_NA",
                         "TRUE COLOR_OW_TOTAL",
                         "PH_DP_NA",
                         "SPECIFIC CONDUCTANCE_DP_NA",
                         "TEMPERATURE_DP_NA",
                         "DEPTH, SECCHI DISK DEPTH_SD_NA",
                         "CALCIUM_OW_TOTAL",
                         "CHLORIDE_OW_TOTAL",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA",
                         "ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL",
                         "ARSENIC_BS_TOTAL",
                         "ARSENIC_OW_TOTAL",
                         "IRON_BS_TOTAL",
                         "IRON_OW_TOTAL",
                         "MAGNESIUM_BS_TOTAL",
                         "MAGNESIUM_OW_TOTAL",
                         "CARBON, DISSOLVED ORGANIC (DOC)_OW_DISSOLVED",
                         "NITROGEN, AMMONIA (AS N)_OW_TOTAL")) %>%
  select(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_RESULT_VALUE,RSLT_PROFILE_DEPTH) %>%
  distinct(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_PROFILE_DEPTH,.keep_all = TRUE) %>%
  rename(LAKE_ID=LAKE_HISTORY_ID,
         chemical_name=combined,
         result_value=RSLT_RESULT_VALUE,
         profile_depth=RSLT_PROFILE_DEPTH)

# dissolved oxygen:
# keep DO values between 0 and 2m depth
lab2 <- lab %>% 
  mutate(keep=case_when(
    !(chemical_name=="DISSOLVED OXYGEN_DP_NA") ~ "no",
    chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth <= 2 ~ "yes",
    chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth > 2 ~ "no")) %>%
  filter(keep!="no") %>%
  select(-keep)

# mean DO for each lake each date
DO <- aggregate(lab2$result_value,list(lab2$LAKE_ID,lab2$SAMPLE_DATE),FUN=mean) %>% 
  mutate(chemical_name="DISSOLVED OXYGEN_epi") %>% 
  rename(LAKE_ID=Group.1,
         SAMPLE_DATE=Group.2,
         result_value=x)

#thermocline
thermocline<-lab %>% 
  filter(chemical_name=='TEMPERATURE_DP_NA') %>% 
  mutate(thermocline=NA)
LID<-thermocline$LAKE_ID[1]
date<-thermocline$SAMPLE_DATE[1]
depth<-thermocline$profile_depth[1]
temp<-thermocline$result_value[1]

for(i in seq(nrow(thermocline))){
  current<-thermocline[i,]
  if(current$LAKE_ID==LID&current$SAMPLE_DATE==date){
    depth=current$profile_depth
    if((temp-current$result_value)>1){
      thermocline$thermocline[i]<-current$profile_depth
    }
    temp=current$result_value
  }
  else{
    LID=current$LAKE_ID
    date=current$SAMPLE_DATE
    depth=current$profile_depth
    temp=current$result_value
  }
}

thermocline<-thermocline %>%  #pull the lowest value for all
  group_by(LAKE_ID,SAMPLE_DATE) %>% 
  mutate(thermocline=min(thermocline, na.rm=T)) %>% 
  ungroup() %>% 
  mutate(thermocline=ifelse(thermocline==Inf,NA,thermocline)) %>% 
  select(LAKE_ID,SAMPLE_DATE,thermocline,profile_depth) %>% 
  filter(!is.na(thermocline)) %>%   #remove those without a thermocline
  distinct()

#epilimentic means for ph and spconductance
epi<-lab %>% 
  inner_join(thermocline,by=c('LAKE_ID','SAMPLE_DATE','profile_depth')) %>% 
  filter(chemical_name %in% c("PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA")) %>% 
  mutate(result_value = ifelse(chemical_name=="PH_DP_NA",(10^result_value),result_value)) %>% 
  distinct() %>% 
  arrange(LAKE_ID,SAMPLE_DATE,chemical_name,profile_depth) 

epi<-epi %>% 
  filter(profile_depth<=thermocline) %>% 
  distinct() %>% 
  group_by(LAKE_ID,SAMPLE_DATE,chemical_name) %>% 
  summarize(Mean=mean(result_value,na.rm=TRUE),
            n=n()) %>% 
  filter(n>2) %>% 
  select(LAKE_ID,SAMPLE_DATE,chemical_name,Mean)

epi<-epi %>% 
  mutate(Mean=ifelse(chemical_name=="PH_DP_NA",log10(Mean),Mean)) %>% 
  rename(result_value=Mean) %>% 
  mutate(chemical_name=case_when(
    chemical_name=="PH_DP_NA"~"PH_epi",
    chemical_name=="SPECIFIC CONDUCTANCE_DP_NA"~"SPECIFIC CONDUCTANCE_epi"
  ))

#hypolimnetic means for ph and dissolved oxygen
hypo<-lab %>% 
  inner_join(thermocline,by=c('LAKE_ID','SAMPLE_DATE','profile_depth')) %>% 
  filter(chemical_name %in% c("PH_DP_NA","DISSOLVED OXYGEN_DP_NA")) %>% 
  mutate(result_value = ifelse(chemical_name=="PH_DP_NA",(10^result_value),result_value)) %>% 
  distinct() %>% 
  arrange(LAKE_ID,SAMPLE_DATE,chemical_name,profile_depth) 

hypo<-hypo %>% 
  filter(profile_depth>=thermocline) %>% 
  distinct() %>% 
  group_by(LAKE_ID,SAMPLE_DATE,chemical_name) %>% 
  summarize(Mean=mean(result_value,na.rm=TRUE),
            n=n()) %>% 
  filter(n>2) %>% 
  select(LAKE_ID,SAMPLE_DATE,chemical_name,Mean)

hypo<-hypo %>% 
  mutate(Mean=ifelse(chemical_name=="PH_DP_NA",log10(Mean),Mean)) %>% 
  rename(result_value=Mean) %>% 
  mutate(chemical_name=case_when(
    chemical_name=="PH_DP_NA"~"PH_hypo",
    chemical_name=="DISSOLVED OXYGEN_DP_NA"~"DISSOLVED OXYGEN_hypo"
  ))

# read in site data
sites<-read.csv("Probability_Based_Sites_2020_2021.csv")
sites<-sites %>% 
  rename(siteID=SITE_ID,
         xcoord=LON_DD83,
         ycoord=LAT_DD83,
         Eval_Status=EvalStatus) %>% 
  filter(Eval_Status!="") %>%    #removing sites that we haven't yet evaluated
  mutate(Eval_Status=trimws(Eval_Status))

# restrict to only the data in the probability study and spread the data
att<-merge(lab,epi,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE) %>% 
  mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>% 
  select(-result_value.x,-result_value.y)
att<-merge(att,hypo,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE) %>% 
  mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>% 
  select(-result_value.x,-result_value.y)
att<-merge(att,DO,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE)%>% 
  mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>% 
  filter(!chemical_name %in% c("DISSOLVED OXYGEN_DP_NA","PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA","TEMPERATURE_DP_NA")) %>% 
  select(-result_value.x,-result_value.y)

att<-merge(att,sites,by=c('LAKE_ID'),all.y = TRUE) %>% distinct()
att<-att %>% spread(chemical_name,result_value,fill = NA)

att<-att %>% filter((Eval_Status=="Target_Sampled"&!is.na(`CHLOROPHYLL A_OW_TOTAL`)&!is.na(`PHOSPHORUS, TOTAL_OW_TOTAL`)) |
                      (Eval_Status!="Target_Sampled") |
                      (LAKE_ID %in% c("0703UWBXXX1","0801KAY0984A","1203MET0821","0602LUD0099","0801GUL0969")))

#creating thresholds
att<-att %>% 
  # remove fields in the sites table that aren't relevant
  select(-Accessible,-Comments,-Contact,-STATUS) %>%
  # create total nitrogen
  mutate(`NITROGEN, TOTAL`=case_when(
    !is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~
      (`NITROGEN, NITRATE-NITRITE_OW_TOTAL`+`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`),
    is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~ `NITROGEN, TOTAL_OW_TOTAL`)) %>%
  # DIN:TP
  mutate(DINTP=`NITROGEN, NITRATE (AS N)_OW_TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`) %>% 
  # trophic status
  mutate(phos_trophic=case_when(
    `PHOSPHORUS, TOTAL_OW_TOTAL`<=0.01 ~ "Oligotrophic",
    between(`PHOSPHORUS, TOTAL_OW_TOTAL`,0.01,0.02) ~ "Mesotrophic",
    `PHOSPHORUS, TOTAL_OW_TOTAL`>=0.02 ~ "Eutrophic")) %>%
  mutate(chla_trophic=case_when(
    `CHLOROPHYLL A_OW_TOTAL`<=2 ~ "Oligotrophic",
    between(`CHLOROPHYLL A_OW_TOTAL`,2,8) ~ "Mesotrophic",
    `CHLOROPHYLL A_OW_TOTAL`>=8 ~ "Eutrophic")) %>%
  # EPA thresholds
  mutate(TP_threshold=case_when(
    `PHOSPHORUS, TOTAL_OW_TOTAL`<=0.016 ~ "Good",
    between(`PHOSPHORUS, TOTAL_OW_TOTAL`, 0.016, 0.0279) ~ "Fair",
    `PHOSPHORUS, TOTAL_OW_TOTAL`>=0.0279 ~ "Poor")) %>%
  mutate(TN_threshold=case_when(
    `NITROGEN, TOTAL`<=0.428 ~ "Good",
    between(`NITROGEN, TOTAL`, 0.428, 0.655) ~ "Fair",
    `NITROGEN, TOTAL`>=0.655 ~ "Poor")) %>%
  mutate(CHLA_threshold=case_when(
    `CHLOROPHYLL A_OW_TOTAL`<=4.52 ~ "Good",
    between(`CHLOROPHYLL A_OW_TOTAL`, 4.52, 8.43) ~ "Fair",
    `CHLOROPHYLL A_OW_TOTAL`>=8.43 ~ "Poor")) %>%
  # microcystin
  mutate(microcystin=case_when(
    is.na(`MICROCYSTIN_OW_NA`) ~"Non-detect",
    `MICROCYSTIN_OW_NA`<8 ~ "Microcystin Detected",
    `MICROCYSTIN_OW_NA`>=8 ~ "Most disturbed")) %>%
  # dissolved oxygen
  mutate(d.oxygen=case_when(
    `DISSOLVED OXYGEN_epi`<=3 ~ "Poor",
    between(`DISSOLVED OXYGEN_epi`, 3, 5) ~ "Fair",
    `DISSOLVED OXYGEN_epi`>=5 ~ "Good")) %>% 
  #nutrient limitation
  mutate(N_LIMIT=case_when(
    `NITROGEN, TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`<=10 ~ "N-limited",
    between(`NITROGEN, TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`, 10, 20) ~ "Co-limited",
    `NITROGEN, TOTAL`/`PHOSPHORUS, TOTAL_OW_TOTAL`>=20 ~ "P-limited")) %>% 
  #secchi
  mutate(secchi=case_when(
    `DEPTH, SECCHI DISK DEPTH_SD_NA`<=2 ~ "Eutrophic",
    between(`DEPTH, SECCHI DISK DEPTH_SD_NA`, 2, 5) ~ "Mesotrophic",
    `DEPTH, SECCHI DISK DEPTH_SD_NA`>=5 ~ "Oligotrophic")) %>% 
  #zebra mussels
  mutate(zebra=case_when(
    CALCIUM_OW_TOTAL<=10 ~ "Not susceptible",
    between(CALCIUM_OW_TOTAL, 10, 20) ~ "May be susceptible",
    CALCIUM_OW_TOTAL>=20 ~ "Highly susceptible")) %>% 
  #cslap
  mutate(conductance=case_when(
    `SPECIFIC CONDUCTANCE_epi`<=125 ~ "Soft",
    between(`SPECIFIC CONDUCTANCE_epi`, 125, 250) ~ "Average",
    `SPECIFIC CONDUCTANCE_epi`>=250 ~ "Hard")) %>%
  mutate(color=case_when(
    `TRUE COLOR_OW_TOTAL`<=10 ~ "Uncolored",
    between(`TRUE COLOR_OW_TOTAL`, 10, 30) ~ "Weak",
    `TRUE COLOR_OW_TOTAL`>=30 ~ "High")) %>% 
  mutate(ph=case_when(
    `PH_epi`<=6.5 ~ "Acidic",
    between(`PH_epi`, 6.5, 7.5) ~ "~Neutral",
    between(`PH_epi`, 7.5, 8.5) ~ "Slightly alk",
    `PH_epi`>=8.5 ~ "Highly alk")) %>% 
  #leech
  mutate(leech=case_when(
    `PHOSPHORUS, TOTAL_OW_TOTAL`<=0.030 & `TRUE COLOR_OW_TOTAL`<=20 ~ "Blue",
    `PHOSPHORUS, TOTAL_OW_TOTAL`>0.030 & `TRUE COLOR_OW_TOTAL`<=20 ~ "Green",
    `PHOSPHORUS, TOTAL_OW_TOTAL`<=0.030 & `TRUE COLOR_OW_TOTAL`>20 ~ "Brown",
    `PHOSPHORUS, TOTAL_OW_TOTAL`>0.030 & `TRUE COLOR_OW_TOTAL`>20 ~ "Murky"
  )) %>% 
  #chloride
  mutate(chloride=case_when(
    `CHLORIDE_OW_TOTAL` <= 35 ~ "Low",
    between(`CHLORIDE_OW_TOTAL`,35,250) ~ "Medium",
    `CHLORIDE_OW_TOTAL` >= 250 ~ "High",
    Eval_Status=="Target_Sampled" & is.na(`CHLORIDE_OW_TOTAL`) ~ "Low")) %>% 
  #chlorophyll
  rename(CHLOROPHYTE=`CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA`,
         CRYPTOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA`,
         CYANOBACTERIA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA`,
         DINOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA`,
         PROBE_TOTAL=`CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA`) %>% 
  mutate(chlorophyte=case_when(
    CHLOROPHYTE/PROBE_TOTAL<=0.25 ~ "Low",
    between(CHLOROPHYTE/PROBE_TOTAL,0.25,0.5) ~ "Medium",
    CHLOROPHYTE/PROBE_TOTAL>=0.5 ~ "High")) %>% 
  mutate(cryptophyta=case_when(
    CRYPTOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
    between(CRYPTOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
    CRYPTOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>% 
  mutate(cyanobacteria=case_when(
    CYANOBACTERIA/PROBE_TOTAL<=0.25 ~ "Low",
    between(CYANOBACTERIA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
    CYANOBACTERIA/PROBE_TOTAL>=0.5 ~ "High")) %>% 
  mutate(dinophyta=case_when(
    DINOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
    between(DINOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
    DINOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>% 
  #alkalinity
  mutate(alkalinity=case_when(
    `ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL` <= 60 ~ "Soft",
    between(`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,60,120) ~ "Moderately hard",
    between(`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,120,180) ~ "Hard",
    `ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL` >= 180 ~ "Very hard"))

# Create a Target/NonTarget status variable
att<-att %>% 
  mutate(statusTNT="Target",
         statusTNT=ifelse(Eval_Status=="NonTarget","NonTarget",statusTNT))

# remove duplicates (multiple samples of same lake)
att <- att %>%
  mutate(SAMPLE_DATE = ymd(SAMPLE_DATE)) %>% # convert to date
  mutate(month = month(SAMPLE_DATE)) %>% # extract month from date
  group_by(siteID) %>%
  mutate(has_aug = 8 %in% month) %>% # annotate as having date in august
  filter(has_aug & month == 8 |
           !has_aug) %>% # filter only august dates from sites with or all from those without
  slice_max(n = 1,
            order_by = SAMPLE_DATE,
            with_ties = F) %>% # change with_ties to TRUE to discard NA values
  select(-has_aug)

att<-att %>% 
  ungroup() %>% 
  mutate(WgtAdj=case_when(
    PROB_CAT=="(1,4]"~(1828/29),
    PROB_CAT=="(4,10]"~(2490/15),
    PROB_CAT=="(10,20]"~(1003/18),
    PROB_CAT=="(20,50]"~(616/20),
    PROB_CAT==">50"~(500/33)))


#read in data on excursions
excursions <- read.csv("probability.data.excursions.csv")
excursions <- excursions %>% 
  select(parameter,n_exceedances,LAKE_ID) %>% 
  spread(parameter,n_exceedances,fill = 0) %>% 
  mutate(LAKE_ID=toupper(LAKE_ID)) %>% 
  mutate(all_params=case_when(
    ammonia==0 & chlorophyll_a==0 & dissolved_oxygen==0 & nitrite==0 & ph==0 & phosphorus==0 ~0,
    LAKE_ID=TRUE~1
  )) %>% 
  rename(AMMONIA_excursions=ammonia,
         DO_excursions=dissolved_oxygen,
         NITRITE_excursions=nitrite,
         PH_excursions=ph,
         PHOSPHORUS_excursions=phosphorus,
         Fully_support=all_params)

att<-merge(att,excursions,by="LAKE_ID")

Additionally, I will load in NLA, NAP and NH data:

# read in NLA data
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")

chem <- read.csv("nla_2017_water_chemistry_chla-data.csv")

chem <- chem %>% 
  filter(ANALYTE %in% c("NTL","CHLA","PTL","NITRATE_N","COLOR","CHLORIDE","MAGNESIUM","DOC","COND","AMMONIA_N","CALCIUM","PH"),
         NARS_FLAG == "") %>% 
  select(UID,SITE_ID,DATE_COL,ANALYTE,RESULT) %>% 
  pivot_wider(names_from= ANALYTE ,values_from = RESULT,) %>% 
  rename(TN=NTL,TP=PTL)

secchi <- read.csv("nla_2017_secchi-data.csv") %>% 
  mutate(SECCHI=(DISAPPEARS+REAPPEARS)/2,
         DATE_COL=mdy(DATE_COL)) %>% 
  select(UID,SITE_ID,DATE_COL,SECCHI)

oxygen <- read.csv("nla_2017_profile-data.csv")

oxygen <- oxygen %>% 
  select(UID,SITE_ID,DATE_COL,DEPTH,OXYGEN) %>% 
  # dissolved oxygen:
  # keep DO values between 0 and 2m depth
  mutate(keep=case_when(
    DEPTH <= 2 ~ "yes",
    DEPTH > 2 ~ "no")) %>%
  filter(keep=="yes") %>%
  select(-keep)

# mean DO for each lake
DO <- aggregate(oxygen$OXYGEN,list(oxygen$SITE_ID,oxygen$DATE_COL),FUN=mean)
# add back in
oxygen <- merge(oxygen,DO,by.x=c("SITE_ID","DATE_COL"),by.y=c("Group.1","Group.2"),all.x=TRUE)
oxygen$x <- as.numeric(oxygen$x)
oxygen <- oxygen %>% 
  select(-c(DEPTH,OXYGEN)) %>%
  rename(OXYGEN=x) %>% 
  distinct()

att.nla <- merge(chem,oxygen,by=c("UID","SITE_ID","DATE_COL"),all.y=TRUE,all.x=TRUE) %>% 
  mutate(DATE_COL=dmy(DATE_COL))
att.nla <- merge(att.nla,secchi,by=c("UID","SITE_ID","DATE_COL"),all.x=TRUE)
att.nla[,4:17] <- lapply(att.nla[,4:17],as.numeric)
att.nla$DINTP <- as.numeric(att.nla$NITRATE_N)/(as.numeric(att.nla$TP)/1000)

# selecting parameters and adding trophic status

att.nla<-att.nla %>% 
  # trophic status
  mutate(phos_trophic=case_when(
    TP<=10 ~ "oligotrophic",
    between(TP,10,20) ~ "mesotrophic",
    TP>=20 ~ "eutrophic")) %>%
  mutate(chla_trophic=case_when(
    CHLA<=2 ~ "oligotrophic",
    between(CHLA,2,8) ~ "mesotrophic",
    CHLA>=8 ~ "eutrophic")) %>%
  # EPA thresholds
  mutate(TP_threshold=case_when(
    TP<=16 ~ "Good",
    between(TP, 16, 27.9) ~ "Fair",
    TP>=27.9 ~ "Poor")) %>%
  mutate(TN_threshold=case_when(
    TN<=0.428 ~ "Good",
    between(TN, 0.428, 0.655) ~ "Fair",
    TN>=0.655 ~ "Poor")) %>%
  mutate(CHLA_threshold=case_when(
    CHLA<=4.52 ~ "Good",
    between(CHLA, 4.52, 8.43) ~ "Fair",
    CHLA>=8.43 ~ "Poor")) %>%
  # dissolved oxygen
  mutate(d.oxygen=case_when(
    OXYGEN<=3 ~ "Poor",
    between(OXYGEN, 3, 5) ~ "Fair",
    OXYGEN>=5 ~ "Good")) %>% 
  #leech
  mutate(leech=case_when(
    TP<=30 & COLOR<=20 ~ "Blue",
    TP>30 & COLOR<=20 ~ "Green",
    TP<=30 & COLOR>20 ~ "Brown",
    TP>30 & COLOR>20 ~ "Murky"
  )) %>% 
  #chloride
  mutate(chloride=case_when(
    CHLORIDE <= 35 ~ "Low",
    between(CHLORIDE,35,250) ~ "Medium",
    CHLORIDE >= 250 ~ "High"))

sites <- read.csv("nla_2017_site_information-data.csv")

nap.sites <- sites %>% 
  mutate(include=case_when(
    AG_ECO9=="NAP" & AREA_HA>2.63 ~ "yes",
    SITE_ID=TRUE ~ "no"
  )) %>% 
  select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>% 
  filter(WGT_TP_EXTENT != 0)

nla.sites <- sites %>% 
  mutate(include=case_when(
    AREA_HA>2.63 ~ "yes",
    SITE_ID=TRUE ~ "no"
  )) %>% 
  select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>% 
  filter(WGT_TP_EXTENT != 0)

nla.att <- merge(att.nla,nla.sites,by=c("UID","SITE_ID"))
nap.att <- merge(att.nla,nap.sites,by=c("UID","SITE_ID"))
nh.att <- read.csv("New_Hampshire_Lakes_2017_Design_Status_20211027_KH_EPA.csv", na.strings=c(""," ","NA"))

# selecting parameters and adding trophic status

nh.att<-nh.att %>% 
  mutate(include=case_when(
    AREA_HA>2.63 ~ "yes",
    SITE_ID=TRUE ~ "no"))

chem <- read.csv("BasicNHChem_ForNY.csv")

#merge

nh.att <- merge(nh.att,chem,by.x="SITE_ID",by.y="NLA.ID",all.x=T) %>% 
  mutate(CHLORIDE=case_when(
    CHLORIDE!="<3" ~ as.numeric(CHLORIDE),
    CHLORIDE=="<3" ~ 0)) %>% 
  mutate(CHLORIDE_threshold=case_when(
    CHLORIDE<=35 ~ "Low",
    between(CHLORIDE,35,250) ~ "Medium",
    CHLORIDE>=250 ~ "High"
  ))

Categorical Variable Analysis

Here I perform the analysis using cat_analysis(). The relevant categorical variable / threshold should be input as the vars argument. vars can be a list of all categories you are interested in, which would create a very big dataframe.

In the future, the weight argument should be part of the site design and might not be called WgtAdj.

#list variables you are interested in and defined above
vars <- c("phos_trophic", "chla_trophic", "TP_threshold", "TN_threshold", "CHLA_threshold", "microcystin", "d.oxygen", "N_LIMIT", "secchi", "zebra", "conductance", "color", "ph", "leech", "chloride","alkalinity","chlorophyte","cryptophyta","cyanobacteria","dinophyta")

#analysis
CatExtent <- cat_analysis(
  dframe=att,
  vars=vars, 
  subpops = , #for example could separate by area category or year
  siteID = "siteID",
  weight = "WgtAdj", #name will probably change in future
  xcoord = "xcoord",
  ycoord = "ycoord")

table <- CatExtent %>% 
  select(Indicator,
         Category,
         Estimate.P,
         LCB95Pct.P,
         UCB95Pct.P) %>% 
  filter(Category!="Total") %>% 
  mutate(Category=factor(Category, levels=c("Poor","Fair","Good",
                                            "Blue","Green","Brown","Murky",
                                            "High","Weak","Uncolored",
                                            "Medium","Low",
                                            "Acidic","~Neutral","Slightly alk","Highly alk",
                                            "Soft","Moderately hard","Average","Hard","Very hard",
                                            "Highly susceptible","May be susceptible","Not susceptible",
                                            "Eutrophic","Mesotrophic","Oligotrophic",
                                            "N-limited","Co-limited","P-limited",
                                            "Non-detect","Microcystin detected","Most disturbed",
                                            "Excursion","Non-excursion")),
         Indicator=case_when(
           Indicator=="phos_trophic"~"Phosphorus", 
           Indicator=="chla_trophic"~"Chlorophyll-a", 
           Indicator=="TP_threshold"~"Total phosphorus", 
           Indicator=="TN_threshold"~"Total nitrogen", 
           Indicator=="CHLA_threshold"~"Total chlorophyll", 
           Indicator=="microcystin"~"Microcystin",
           Indicator=="d.oxygen"~"Dissolved oxygen",
           Indicator=="N_LIMIT"~"Nutrient limitation", 
           Indicator=="secchi"~"Secchi", 
           Indicator=="zebra"~"Zebra mussel susceptibility", 
           Indicator=="conductance"~"Hardness", 
           Indicator=="color"~"Color", 
           Indicator=="ph"~"pH", 
           Indicator=="leech"~"Nutrient-color status", 
           Indicator=="chloride"~"Chloride",
           Indicator=TRUE~Indicator)
  )

hux <- as_hux(table) #I use huxtable because kable doesn't work on my computer but that is also fine 
number_format(hux) <- 2
theme_plain(hux)
IndicatorCategoryEstimate.PLCB95.00Pct.PUCB95.00Pct.P
PhosphorusEutrophic22.3511.2533.45
PhosphorusMesotrophic34.3318.3550.31
PhosphorusOligotrophic43.3228.9257.72
Chlorophyll-aEutrophic11.434.9717.89
Chlorophyll-aMesotrophic69.3356.1282.53
Chlorophyll-aOligotrophic19.247.0831.41
Total phosphorusFair18.316.5330.09
Total phosphorusGood64.8951.8577.92
Total phosphorusPoor16.816.0627.56
Total nitrogenFair14.265.6622.86
Total nitrogenGood66.7952.5681.02
Total nitrogenPoor18.955.8832.02
Total chlorophyllFair34.0917.7650.41
Total chlorophyllGood54.4938.3370.64
Total chlorophyllPoor11.434.9717.89
MicrocystinNon-detect100.00100.00100.00
Dissolved oxygenFair1.280.003.46
Dissolved oxygenGood91.1680.11100.00
Dissolved oxygenPoor7.550.0018.47
Nutrient limitationCo-limited17.323.5131.12
Nutrient limitationN-limited0.780.002.13
Nutrient limitationP-limited81.9068.0195.80
SecchiEutrophic46.3731.3761.37
SecchiMesotrophic48.6933.5063.88
SecchiOligotrophic4.940.809.09
Zebra mussel susceptibilityHighly susceptible29.7212.5446.90
Zebra mussel susceptibilityMay be susceptible14.870.0031.71
Zebra mussel susceptibilityNot susceptible55.4134.8076.02
HardnessAverage16.412.3830.45
HardnessHard23.197.9138.48
HardnessSoft60.3944.3776.42
ColorHigh68.6452.2085.07
ColorUncolored8.280.1416.41
ColorWeak23.0910.2535.92
pH~Neutral47.3430.0364.65
pHAcidic15.264.4226.10
pHHighly alk15.281.4929.08
pHSlightly alk22.1210.2833.95
Nutrient-color statusBlue21.597.1536.04
Nutrient-color statusBrown54.5034.9474.06
Nutrient-color statusGreen1.080.003.07
Nutrient-color statusMurky22.836.7638.90
ChlorideHigh6.600.0016.90
ChlorideLow77.1964.3790.01
ChlorideMedium16.215.6026.81
alkalinityHard11.470.1822.75
alkalinityModerately hard20.637.2434.03
alkalinitySoft67.9054.4381.37
chlorophyteHigh75.4558.6092.30
chlorophyteLow13.420.0028.43
chlorophyteMedium11.131.6020.66
cryptophytaLow100.00100.00100.00
cyanobacteriaHigh16.190.3232.07
cyanobacteriaLow65.9347.8284.03
cyanobacteriaMedium17.886.7529.00
dinophytaHigh2.110.005.90
dinophytaLow87.8478.3897.29
dinophytaMedium10.061.5118.61
ny.cat <- table %>% filter(Indicator %in% c("Total phosphorus","Total nitrogen","Total chlorophyll","Dissolved oxygen","Chloride")) %>% mutate(Study="NY")

probability.cat <- table %>% filter(Indicator %in% c("Phosphorus","Chlorophyll-a","Secchi"))

Continuous Variable Analysis

Here I perform the analysis using cont_analysis(). The relevant CONTINUOUS variable should be input as the vars argument. vars can be a list of all categories you are interested in, which would create a very big dataframe with all. The output of this function is a list with 3 estimations: cumulative distribution function (CDF), percentiles, and means.

As above, in the future the weight argument should be part of the site design and might not be called WgtAdj.

#To conduct analysis on a continuous variable, using a new list of CONTINUOUS variables, also note that cont_analysis() doesn't like variable names that have spaces so you will need to rename these. It's helpful to put the units in the name.

att <- att %>% rename("CHLOROPHYLL_ug_L"=`CHLOROPHYLL A_OW_TOTAL`,
                      "NITROGEN_mg_L"=`NITROGEN, TOTAL`,
                      "PHOSPHORUS_mg_L"=`PHOSPHORUS, TOTAL_OW_TOTAL`,
                      "DISSOLVED_OXYGEN_mg_L"=`DISSOLVED OXYGEN_hypo`,
                      "ALKALINITY_mg_L"=`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,
                      "CHLORIDE_mg_L"=CHLORIDE_OW_TOTAL,
                      "CALCIUM_mg_L"=CALCIUM_OW_TOTAL,
                      "DOC_mg_L"=`CARBON, DISSOLVED ORGANIC (DOC)_OW_DISSOLVED`,
                      "SECCHI_m"=`DEPTH, SECCHI DISK DEPTH_SD_NA`,
                      "TRUE_COLOR"=`TRUE COLOR_OW_TOTAL`,
                      "SPEC_CONDUCTANCE"=`SPECIFIC CONDUCTANCE_epi`,
                      "AMMONIA_N"=`NITROGEN, AMMONIA (AS N)_OW_TOTAL`,
                      "NP_ratio"=DINTP)

myvars <- c("CHLOROPHYLL_ug_L","NITROGEN_mg_L","PHOSPHORUS_mg_L","CHLORIDE_mg_L","DISSOLVED_OXYGEN_mg_L","DOC_mg_L","CALCIUM_mg_L","SECCHI_m","TRUE_COLOR","SPEC_CONDUCTANCE","AMMONIA_N","MAGNESIUM_OW_TOTAL","PH_hypo","ARSENIC_BS_TOTAL", "ARSENIC_OW_TOTAL", "IRON_BS_TOTAL", "IRON_OW_TOTAL",  "MAGNESIUM_BS_TOTAL", "MAGNESIUM_OW_TOTAL","ALKALINITY_mg_L","NP_ratio")

#Creates 3 estimations in list: CDF, percentiles, means.
analysis <- cont_analysis(
  dframe = att,
  vars = myvars,
  subpops = ,
  siteID = "siteID",
  weight = "WgtAdj",
  xcoord = "xcoord",
  ycoord = "ycoord")

NY<-analysis$CDF %>% 
  select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>% 
  mutate(Study="NY") %>% 
  mutate(Value=case_when(
    Indicator=="PHOSPHORUS_mg_L"~as.numeric(Value)*1000,
    Indicator=TRUE~as.numeric(Value)
  )) %>% 
  mutate(Indicator=case_when(
    Indicator=="PHOSPHORUS_mg_L"~"PHOSPHORUS_ug_L",
    Indicator=TRUE~Indicator
  )) 

str(analysis)
## List of 3
##  $ CDF :'data.frame':    659 obs. of  15 variables:
##   ..$ Type           : chr [1:659] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
##   ..$ Subpopulation  : chr [1:659] "All Sites" "All Sites" "All Sites" "All Sites" ...
##   ..$ Indicator      : chr [1:659] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
##   ..$ Value          : num [1:659] 1.00e-10 4.45e-01 5.50e-01 6.84e-01 1.05 ...
##   ..$ nResp          : num [1:659] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ Estimate.P     : num [1:659] 1.38 2.77 3.45 4.13 4.81 ...
##   ..$ StdError.P     : num [1:659] 1.22 1.7 1.82 1.95 2 ...
##   ..$ MarginofError.P: num [1:659] 2.39 3.32 3.56 3.81 3.93 ...
##   ..$ LCB95Pct.P     : num [1:659] 0 0 0 0.318 0.886 ...
##   ..$ UCB95Pct.P     : num [1:659] 3.77 6.09 7.01 7.95 8.74 ...
##   ..$ Estimate.U     : num [1:659] 30.8 61.6 76.8 91.9 107.1 ...
##   ..$ StdError.U     : num [1:659] 27 37.4 39.6 41.9 42.6 ...
##   ..$ MarginofError.U: num [1:659] 52.9 73.4 77.5 82.1 83.5 ...
##   ..$ LCB95Pct.U     : num [1:659] 0 0 0 9.76 23.54 ...
##   ..$ UCB95Pct.U     : num [1:659] 83.7 135 154.3 174 190.6 ...
##  $ Pct :'data.frame':    147 obs. of  10 variables:
##   ..$ Type         : chr [1:147] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
##   ..$ Subpopulation: chr [1:147] "All Sites" "All Sites" "All Sites" "All Sites" ...
##   ..$ Indicator    : chr [1:147] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
##   ..$ Statistic    : chr [1:147] "5Pct" "10Pct" "25Pct" "50Pct" ...
##   ..$ nResp        : num [1:147] 5 9 15 26 33 45 48 4 4 10 ...
##   ..$ Estimate     : num [1:147] 1.08 1.08 1.08 1.08 1.08 ...
##   ..$ StdError     : num [1:147] 0.411 0.357 0.433 0.586 1.209 ...
##   ..$ MarginofError: num [1:147] 0.805 0.7 0.848 1.148 2.369 ...
##   ..$ LCB95Pct     : num [1:147] 0 0.513 1.564 2.542 4.584 ...
##   ..$ UCB95Pct     : num [1:147] 1.65 1.94 3.3 4.89 9.43 ...
##  $ Mean:'data.frame':    21 obs. of  9 variables:
##   ..$ Type         : chr [1:21] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
##   ..$ Subpopulation: chr [1:21] "All Sites" "All Sites" "All Sites" "All Sites" ...
##   ..$ Indicator    : chr [1:21] "CHLOROPHYLL_ug_L" "NITROGEN_mg_L" "PHOSPHORUS_mg_L" "CHLORIDE_mg_L" ...
##   ..$ nResp        : int [1:21] 55 47 60 55 33 41 33 59 35 38 ...
##   ..$ Estimate     : num [1:21] 5.0524 0.5559 0.0186 44.8451 4.8785 ...
##   ..$ StdError     : num [1:21] 0.45911 0.08874 0.00235 18.67918 0.74625 ...
##   ..$ MarginofError: num [1:21] 0.8998 0.1739 0.0046 36.6105 1.4626 ...
##   ..$ LCB95Pct     : num [1:21] 4.153 0.382 0.014 8.235 3.416 ...
##   ..$ UCB95Pct     : num [1:21] 5.9522 0.7298 0.0232 81.4556 6.3411 ...

Plots

Here I have plotted the analyses from above. The categorical variables are plotted as dots, maps and bars; the continuous variables are plotted as a distribution function. They are also grouped by QAPP parameter categories in the “Grouped plots” tab.

Category dot plot

plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
  geom_point()+
  geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
  theme(legend.position = "none")+
  facet_wrap(.~Indicator,scales = "free")+
  ylim(0,100)+
  labs(title="Condition estimates across NYS Ponded Waters",y="Percent of Total",x="Condition category")+
  theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))

plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))

gridExtra::grid.arrange(plot)

Cumulative distribution function

#To plot a cumulative distribution function:
plot <- ggplot(analysis$CDF,aes(x=Value,y=Estimate.P,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
  geom_line()+
  geom_point()+
  geom_ribbon(alpha=0.5)+
  ylim(0,100)+
  facet_wrap(.~Indicator, scales="free")+
  guides(fill=FALSE,shape=FALSE)+
  labs(y="Percent of lakes")

plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))

gridExtra::grid.arrange(plot)

Map

#fetch map
library(ggmap)
library(maps)

states <- map_data("state")
ny <- subset(states, region %in% c("new york"))

att$CHLA_threshold <- factor(att$CHLA_threshold, levels=c("Good","Fair","Poor"))
ggplot() + 
  geom_polygon(data=ny,aes(x = long, y = lat, group = group), color = "white") + 
  coord_fixed(1.3) +
  geom_point(data=att,aes(x=xcoord,y=ycoord,color=CHLA_threshold))+
  guides(fill=FALSE)+
  theme(panel.border = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

Grouped plots

Categorical

#list variables you are interested in and defined above
trophic <- c("phos_trophic","chla_trophic","N_LIMIT","secchi","leech","color")
minerals <- c("zebra","alkalinity")
in.situ <- c("ph","conductance")
habs <- c("microcystin","chlorophyte","cryptophyta","cyanobacteria","dinophyta")
fishing <- c("AMMONIA_excursions","DO_excursions","NITRITE_excursions","PH_excursions","Fully_support")

vars.list <- list(Trophic=trophic,Minerals=minerals,`In Situ`=in.situ,HABs=habs,`Fishing use`=fishing)

n<-0

#analysis
for(i in vars.list){
  
  n<-n+1
  
  CatExtent <- cat_analysis(
    dframe=att,
    vars=i, 
    subpops = , #for example could separate by area category or year
    siteID = "siteID",
    weight = "WgtAdj", #name will probably change in future
    xcoord = "xcoord",
    ycoord = "ycoord")
  
  table <- CatExtent %>% 
    select(Indicator,
           Category,
           Estimate.P,
           LCB95Pct.P,
           UCB95Pct.P) %>% 
    filter(Category!="Total") %>% 
    mutate(Category=case_when(
      Category==0  ~ "Non-excursion",
      Category==1 ~ "Excursion",
      Category=TRUE~Category
    )) %>% 
    mutate(Category=factor(Category, levels=c("Poor","Fair","Good",
                                              "Blue","Green","Brown","Murky",
                                              "High","Weak","Uncolored",
                                              "Medium","Low",
                                              "Acidic","~Neutral","Slightly alk","Highly alk",
                                              "Soft","Moderately hard","Average","Hard","Very hard",
                                              "Highly susceptible","May be susceptible","Not susceptible",
                                              "Eutrophic","Mesotrophic","Oligotrophic",
                                              "N-limited","Co-limited","P-limited",
                                              "Non-detect","Microcystin detected","Most disturbed",
                                              "Excursion","Non-excursion")),
           Indicator=case_when(
             Indicator=="phos_trophic"~"Phosphorus", 
             Indicator=="chla_trophic"~"Chlorophyll-a", 
             Indicator=="TP_threshold"~"Total phosphorus", 
             Indicator=="TN_threshold"~"Total nitrogen", 
             Indicator=="CHLA_threshold"~"Total chlorophyll", 
             Indicator=="microcystin"~"Microcystin",
             Indicator=="d.oxygen"~"Dissolved oxygen",
             Indicator=="N_LIMIT"~"Nutrient limitation", 
             Indicator=="secchi"~"Secchi", 
             Indicator=="zebra"~"Zebra mussel susceptibility", 
             Indicator=="conductance"~"Conductance", 
             Indicator=="color"~"Color", 
             Indicator=="ph"~"pH", 
             Indicator=="leech"~"Nutrient-color status", 
             Indicator=='alkalinity'~"Alkalinity",
             Indicator=="chlorophyte"~"% Chlorophyte",
             Indicator=="cryptophyta"~"% Cryptophyta",
             Indicator=="cyanobacteria"~"% Cyanobacteria",
             Indicator=="dinophyta"~"% Dinophyta",
             Indicator=="AMMONIA_excursions"~"Ammonia",
             Indicator=="DO_excursions"~"Dissolved oxygen",
             Indicator=="NITRITE_excursions"~"Nitrite",
             Indicator=="PH_excursions"~"pH",
             Indicator=="Fully_support"~"All parameters",
             Indicator=TRUE~Indicator)
    )
  
  if(names(vars.list)[n]=="Fishing use"){
    plot1<-ggplot(table[table$Indicator=="All parameters",],aes(x=Category,y=Estimate.P)) +
      geom_col()+
      geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
      theme(legend.position = "none")+
      ylim(0,100)+
      labs(y="Percent of Total",x="Fully supporting",title="Fishing Use Parameters")+
      theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
    
    plot2<-ggplot(table[table$Indicator!="All parameters",],aes(x=Indicator,y=Estimate.P,fill=Category)) +
      geom_bar(position="stack", stat="identity")+
      ylim(0,100)+
      labs(y="",x="Parameter")+
      theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))+
      scale_fill_grey()
    
    plot1 <- set_panel_size(p = plot1, width = unit(4, "cm"), height = unit(4,"cm"))
    plot2 <- set_panel_size(p = plot2, width = unit(6, "cm"), height = unit(4,"cm"))
    
    gridExtra::grid.arrange(plot1,plot2,ncol=2)
  }else{
    plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
      geom_col()+
      geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
      theme(legend.position = "none")+
      facet_wrap(.~Indicator,scales = "free")+
      ylim(0,100)+
      labs(y="Percent of Total",x="Condition category",title=paste(names(vars.list)[n],"Parameters"))+
      theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
    
    plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
    
    gridExtra::grid.arrange(plot)
  }
  
  
  
}

Continuous

#list variables you are interested in and defined above
trophic <- c("CHLOROPHYLL_ug_L",
             "NITROGEN_mg_L",
             "PHOSPHORUS_mg_L",
             "SECCHI_m",
             "TRUE_COLOR",
             "NP_ratio")
minerals <- c("CALCIUM_mg_L","ALKALINITY_mg_L")
salt <- c("CHLORIDE_mg_L")
in.situ <- c("DISSOLVED_OXYGEN_mg_L",
             "PH_hypo",
             "SPEC_CONDUCTANCE")
metals <- c("ARSENIC_BS_TOTAL",
            "ARSENIC_OW_TOTAL",
            "IRON_BS_TOTAL",
            "IRON_OW_TOTAL",
            "MAGNESIUM_BS_TOTAL",
            "MAGNESIUM_OW_TOTAL")
habs <- c("CYANOBACTERIA")

vars.list <- list(Trophic=trophic,Minerals=minerals,Salt=salt,`In Situ`=in.situ,Metals=metals,HABs=habs)

n<-0

for(i in vars.list){
  
  n<-n+1
  
  #Creates 3 estimations in list: CDF, percentiles, means.
  analysis <- cont_analysis(
    dframe = att,
    vars = i,
    subpops = ,
    siteID = "siteID",
    weight = "WgtAdj",
    xcoord = "xcoord",
    ycoord = "ycoord")
  
  table<-analysis$CDF %>% 
    select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>% 
    filter(Indicator %in% i) %>% 
    mutate(Indicator=case_when(
      Indicator=="CHLOROPHYLL_ug_L"~"Chlorophyll-a (ug/L)",
      Indicator=="NITROGEN_mg_L"~"Total nitrogen (mg/L)",
      Indicator=="PHOSPHORUS_mg_L"~"Total phosphorus (mg/L)",
      Indicator=="DISSOLVED_OXYGEN_mg_L"~"Dissolved oxygen (mg/L)",
      Indicator=="ALKALINITY_mg_L"~"Alkalinity (mg/L)",
      Indicator=="CHLORIDE_mg_L"~"Chloride (mg/L)",
      Indicator=="CALCIUM_mg_L"~"Calcium (mg/L)",
      Indicator=="SECCHI_m"~"Secchi disk depth (m)",
      Indicator=="TRUE_COLOR"~"True color (PCU)",
      Indicator=="SPEC_CONDUCTANCE"~"Conductance (mS/cm)",
      Indicator=="PH_hypo"~"pH (SU)",
      Indicator=="ARSENIC_BS_TOTAL"~"Arsenic, bottom (ug/L)",
      Indicator=="ARSENIC_OW_TOTAL"~"Arsenic, open water (ug/L)",
      Indicator=="IRON_BS_TOTAL"~"Iron, bottom (ug/L)",
      Indicator=="IRON_OW_TOTAL"~"Iron, open water (ug/L)",
      Indicator=="MAGNESIUM_BS_TOTAL"~"Magnesium, bottom (ug/L)",
      Indicator=="MAGNESIUM_OW_TOTAL"~"Magnesium, open water (ug/L)",
      Indicator=="NP_ratio"~"DIN:TP ratio",
      Indicator=="CYANOBACTERIA"~"% Cyanobacteria",
      Indicator=TRUE~Indicator
    )) %>% 
    mutate(Lower=case_when(
      Indicator=="Chlorophyll-a (ug/L)"~2,
      Indicator=="Total phosphorus (mg/L)"~0.01,
      Indicator=="Specific conductance (mS/cm)"~125,
      Indicator=="Alkalinity (mg/L)"~60,
      Indicator=="Calcium (mg/L)"~10,
      Indicator=="True color (PCU)"~10,
      Indicator=="Secchi disk depth (m)"~2,
      Indicator=TRUE~as.numeric(""))) %>% 
    mutate(Upper=case_when(
      Indicator=="Chlorophyll-a (ug/L)"~8,
      Indicator=="Total phosphorus (mg/L)"~0.02,
      Indicator=="Specific conductance (mS/cm)"~250,
      Indicator=="Alkalinity (mg/L)"~120,
      Indicator=="Calcium (mg/L)"~20,
      Indicator=="True color (PCU)"~20,
      Indicator=="Secchi disk depth (m)"~5,
      Indicator=TRUE~as.numeric("")))
  
  plot<-ggplot(table,aes(x=Value,y=Estimate.P,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
    geom_line()+
    geom_point()+
    geom_ribbon(alpha=0.5)+
    ylim(0,100)+
    guides(fill="none",shape="none")+
    facet_wrap(.~Indicator, scales="free")+
    geom_vline(aes(xintercept=Lower),linetype="dashed")+
    geom_vline(aes(xintercept=Upper),linetype="dashed")+
    labs(y="Percent of lakes",title=paste(names(vars.list[n]),"Parameters"))+
    scale_color_grey()
  
  plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
  
  gridExtra::grid.arrange(plot)
  
}

Comparisons

Here I make the comparisons to NLA and NH data from above, as well as to historical LMAS sampling. Thresholds for NLA, NH and NY comparisons are from the NLA and are “good”, “fair”, and “poor” for chlorophyll-a, phosphorus, nitrogen and epilimnetic dissolved oxygen. There are no thresholds for chloride.

The values for LMAS sampling are derived from a separate script that counts the number of lakes in each size class from samples in the last 10 years.

Category dot plot

## During execution of the program, 3 warning messages were generated.  The warning 
## messages are stored in a data frame named 'warn_df'.  Enter the following 
## command to view the warning messages: warnprnt() 
## To view a subset of the warning messages (say, messages number 1, 3, and 5), 
## enter the following command: warnprnt(m=c(1,3,5))
## During execution of the program, a warning message was generated.  The warning 
## message is stored in a data frame named 'warn_df'.  Enter the following command 
## to view the warning message: warnprnt()
## During execution of the program, 10 warning messages were generated.  The warning 
## messages are stored in a data frame named 'warn_df'.  Enter the following 
## command to view the warning messages: warnprnt() 
## To view a subset of the warning messages (say, messages number 1, 3, and 5), 
## enter the following command: warnprnt(m=c(1,3,5))
cats <- rbind(ny.cat,nh.cat,nla.cat,nap.cat)

plot <- ggplot(cats,aes(x=Category,y=Estimate.P,color=Study,shape=Study))+
  geom_point(position=position_dodge(width=0.2))+
  ylim(0,100)+
  geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=.25, position=position_dodge(width=0.2))+
  facet_wrap(.~Indicator, scales="free")+
  labs(title="Condition category extent estimates")

plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))

gridExtra::grid.arrange(plot)

Cumulative distribution function

## During execution of the program, a warning message was generated.  The warning 
## message is stored in a data frame named 'warn_df'.  Enter the following command 
## to view the warning message: warnprnt()
## During execution of the program, a warning message was generated.  The warning 
## message is stored in a data frame named 'warn_df'.  Enter the following command 
## to view the warning message: warnprnt()
#list variables you are interested in and defined above
trophic <- c("CHLOROPHYLL_ug_L",
             "NITROGEN_mg_L",
             "PHOSPHORUS_ug_L",
             "SECCHI_m",
             "TRUE_COLOR",
             "NP_ratio")
minerals <- c("CALCIUM_mg_L","ALKALINITY_mg_L")
salt <- c("CHLORIDE_mg_L")
in.situ <- c("DISSOLVED_OXYGEN_mg_L",
             "PH_hypo",
             "SPEC_CONDUCTANCE")
metals <- c("ARSENIC_BS_TOTAL",
            "ARSENIC_OW_TOTAL",
            "IRON_BS_TOTAL",
            "IRON_OW_TOTAL",
            "MAGNESIUM_BS_TOTAL",
            "MAGNESIUM_OW_TOTAL")

vars.list <- list(Trophic=trophic,Minerals=minerals,Salt=salt,`In Situ`=in.situ,Metals=metals)

n<-0

for(i in vars.list){
  
  n<-n+1
  
  table<-all %>% 
    filter(Indicator %in% i) %>% 
    mutate(Indicator=case_when(
      Indicator=="CHLOROPHYLL_ug_L"~"Chlorophyll-a (ug/L)",
      Indicator=="NITROGEN_mg_L"~"Total nitrogen (mg/L)",
      Indicator=="PHOSPHORUS_ug_L"~"Total phosphorus (ug/L)",
      Indicator=="DISSOLVED_OXYGEN_mg_L"~"Dissolved oxygen (mg/L)",
      Indicator=="ALKALINITY_mg_L"~"Alkalinity (mg/L)",
      Indicator=="CHLORIDE_mg_L"~"Chloride (mg/L)",
      Indicator=="CALCIUM_mg_L"~"Calcium (mg/L)",
      Indicator=="SECCHI_m"~"Secchi disk depth (m)",
      Indicator=="TRUE_COLOR"~"True color (PCU)",
      Indicator=="SPEC_CONDUCTANCE"~"Conductance (mS/cm)",
      Indicator=="PH_hypo"~"pH (SU)",
      Indicator=="ARSENIC_BS_TOTAL"~"Arsenic, bottom (ug/L)",
      Indicator=="ARSENIC_OW_TOTAL"~"Arsenic, open water (ug/L)",
      Indicator=="IRON_BS_TOTAL"~"Iron, bottom (ug/L)",
      Indicator=="IRON_OW_TOTAL"~"Iron, open water (ug/L)",
      Indicator=="MAGNESIUM_BS_TOTAL"~"Magnesium, bottom (ug/L)",
      Indicator=="MAGNESIUM_OW_TOTAL"~"Magnesium, open water (ug/L)",
      Indicator=="DOC_mg_L"~"Dissolved organic carbon (mg/L)",
      Indicator=="NP_ratio"~"DIN:TP ratio",
      Indicator=TRUE~Indicator
    )) %>%
    filter((Indicator=="Chlorophyll-a (ug/L)"&Value<50)|
             (Indicator=="Chloride (mg/L)"&Value<250)|
             (Indicator=="Total nitrogen (mg/L)"&Value<2.5)|
             (Indicator=="Total phosphorus (ug/L)"&Value<150)|
             (Indicator=="Dissolved organic carbon (mg/L)"&Value<50)|
             (Indicator=="Calcium (mg/L)"&Value<50)|
             (Indicator=="Conductance (mS/cm)" & Value<1000)|
             (Indicator=="Magnesium, open water (ug/L)"&Value<25000)|
             !(Indicator %in% c("Chlorophyll-a (ug/L)","Chloride (mg/L)","Total nitrogen (mg/L)", "Total phosphorus (ug/L)","Dissolved organic carbon (mg/L)","Calcium (mg/L)","Conductance (mS/cm)","Magnesium, open water (ug/L)"))) %>% 
    mutate(Study=factor(Study, levels=c("NY","NLA","NAP","NH")))
  
  plot <- ggplot(table,aes(x=Value,y=Estimate.P,color=Study,shape=Study,linetype=Study))+
    geom_line()+
    ylim(0,100)+
    scale_linetype_manual(values=c("solid","dotdash", "dotted","dashed"))+
    facet_wrap(.~Indicator, scales="free")+
    labs(y="Percent of lakes",title=paste(names(vars.list[n]),"Parameters"))
  
  plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
  
  gridExtra::grid.arrange(plot)
  
}

Historical bias

The plot below shows the distribution of the size classes and trophic classes included in the data frame. For comparison, I’ve plotted the proportion of NYS lakes sampled in each size class since 2010.

#Plotting data frame distribution as well as NYS sample distribution collected since 2010
#LMAS numbers retrieved in 2020

frame<-att %>% 
  select(PROB_CAT,AREA_CAT) %>% 
  rename(size_category=AREA_CAT) %>% 
  distinct() %>%
  mutate(Probability_Sites = case_when( #statewide estimate
    endsWith(size_category, "(1,4]") ~ ((1828/6437)*100),
    endsWith(size_category, "(4,10]") ~ ((2490/6437)*100),
    endsWith(size_category, "(10,20]") ~ ((1003/6437)*100),
    endsWith(size_category, "(20,50]") ~ ((616/6437)*100),
    endsWith(size_category, ">50") ~ ((500/6437)*100))) %>%
  mutate(LMAS_Sites = case_when(
    endsWith(size_category, "(1,4]") ~ (7.64),
    endsWith(size_category, "(4,10]") ~ (16.7),
    endsWith(size_category, "(10,20]") ~ (16.2),
    endsWith(size_category, "(20,50]") ~ (19.3),
    endsWith(size_category, ">50") ~ (40.1))) %>% 
  rename("Statewide"=Probability_Sites,
         "LMAS"=LMAS_Sites) %>% 
  ungroup() %>%
  select(-PROB_CAT) %>% 
  gather(study,percentage,-size_category) %>% 
  arrange(study,size_category) %>% 
  mutate(percentage=as.numeric(percentage),
         size_category=factor(size_category,levels=c("(1,4]","(4,10]","(10,20]","(20,50]",">50")))

plot <- ggplot(frame, aes(x=size_category,y=percentage,fill=study)) +
  geom_bar(stat = "identity", position = position_dodge())+
  labs(title="Size classes",y="Percent of Total",x="Size category (hectares)")+
  scale_fill_grey()+
  theme(legend.position = "none")

plot1 <- set_panel_size(p = plot, width = unit(6, "cm"), height = unit(4,"cm"))
forplot<-probability.cat %>% 
  mutate(study="Statewide") %>% 
  add_row(Indicator="Chlorophyll-a",
          Category="Eutrophic",
          study="LMAS",
          Estimate.P=((215/473)*100)) %>% 
  add_row(Indicator="Chlorophyll-a",
          Category="Mesotrophic",
          study="LMAS",
          Estimate.P=((182/473)*100)) %>% 
  add_row(Indicator="Chlorophyll-a",
          Category="Oligotrophic",
          study="LMAS",
          Estimate.P=((76/473)*100)) %>% 
  add_row(Indicator="Phosphorus",
          Category="Eutrophic",
          study="LMAS",
          Estimate.P=((211/473)*100)) %>% 
  add_row(Indicator="Phosphorus",
          Category="Mesotrophic",
          study="LMAS",
          Estimate.P=((141/473)*100)) %>% 
  add_row(Indicator="Phosphorus",
          Category="Oligotrophic",
          study="LMAS",
          Estimate.P=((121/473)*100)) %>% 
  add_row(Indicator="Secchi",
          Category="Eutrophic",
          study="LMAS",
          Estimate.P=((371/700)*100)) %>% 
  add_row(Indicator="Secchi",
          Category="Mesotrophic",
          study="LMAS",
          Estimate.P=((279/700)*100)) %>% 
  add_row(Indicator="Secchi",
          Category="Oligotrophic",
          study="LMAS",
          Estimate.P=((50/700)*100)) %>% 
  mutate(LCB95Pct.P=ifelse(is.na(LCB95Pct.P),Estimate.P,LCB95Pct.P),
         UCB95Pct.P=ifelse(is.na(UCB95Pct.P),Estimate.P,UCB95Pct.P))


plot <- ggplot(forplot, aes(x=Category,fill=study)) +
  geom_bar_pattern(stat = "identity", position = position_dodge(),
                   aes(y=Estimate.P,
                       pattern_type = study,
                       pattern_fill = study), 
                   pattern= 'magick',
                   fill= 'white',
                   colour= 'black',
                   pattern_scale = 0.5,
                   pattern_key_scale_factor = 1.1)+
  geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=0.2,
                position=position_dodge(.9))+
  facet_wrap(~Indicator,scales = "free")+
  ylim(0,100)+
  labs(title="Trophic Classes across NYS versus LMAS samples since 2010",y="Percent of Total",x="Trophic Classes")+
  theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1),
        legend.key.height = unit(1, 'cm'),
        legend.key.width = unit(1, 'cm'))+
  scale_pattern_type_discrete(choices = c("gray15","bricks"))

# plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
# 
# gridExtra::grid.arrange(plot)
plot2 <- ggplot(forplot[forplot$Indicator=="Phosphorus",], aes(x=Category,fill=study,y=Estimate.P)) +
  geom_bar(stat = "identity", position = position_dodge())+
  geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=0.2,
                position=position_dodge(.9))+
  labs(title="Phosphorus",x="Trophic class",y="")+
  theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1),
        legend.key.height = unit(1, 'cm'),
        legend.key.width = unit(1, 'cm'))+
  scale_fill_grey()

plot2 <- set_panel_size(p = plot2, width = unit(4, "cm"), height = unit(4,"cm"))

gridExtra::grid.arrange(plot1, plot2,ncol=2)

Session info

sessionInfo(package=NULL)
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] maps_3.4.0      ggmap_3.0.0     spsurvey_5.0.1  survey_4.1-1   
##  [5] survival_3.2-13 Matrix_1.3-4    sf_1.0-3        ggpattern_0.4.2
##  [9] egg_0.4.5       gridExtra_2.3   lubridate_1.8.0 huxtable_5.4.0 
## [13] forcats_0.5.1   stringr_1.4.0   dplyr_1.0.7     purrr_0.3.4    
## [17] readr_2.0.2     tidyr_1.1.4     tibble_3.1.6    ggplot2_3.3.5  
## [21] tidyverse_1.3.1
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-153        bitops_1.0-7        fs_1.5.0           
##  [4] httr_1.4.2          tools_4.1.2         backports_1.3.0    
##  [7] bslib_0.3.1         utf8_1.2.2          R6_2.5.1           
## [10] KernSmooth_2.23-20  AlgDesign_1.2.0     DBI_1.1.1          
## [13] colorspace_2.0-2    sp_1.4-5            withr_2.4.2        
## [16] tidyselect_1.1.1    compiler_4.1.2      cli_3.1.0          
## [19] rvest_1.0.2         xml2_1.3.2          labeling_0.4.2     
## [22] sass_0.4.0          scales_1.1.1        classInt_0.4-3     
## [25] proxy_0.4-26        commonmark_1.7      crossdes_1.1-1     
## [28] digest_0.6.28       minqa_1.2.4         rmarkdown_2.11     
## [31] jpeg_0.1-9          pkgconfig_2.0.3     htmltools_0.5.2    
## [34] lme4_1.1-27.1       highr_0.9           dbplyr_2.1.1       
## [37] fastmap_1.1.0       rlang_0.4.12        readxl_1.3.1       
## [40] rstudioapi_0.13     farver_2.1.0        jquerylib_0.1.4    
## [43] generics_0.1.1      jsonlite_1.7.2      gtools_3.9.2       
## [46] magrittr_2.0.1      Rcpp_1.0.7          munsell_0.5.0      
## [49] fansi_0.5.0         lifecycle_1.0.1     stringi_1.7.5      
## [52] yaml_2.2.1          MASS_7.3-54         plyr_1.8.6         
## [55] crayon_1.4.2        deldir_1.0-6        lattice_0.20-45    
## [58] haven_2.4.3         splines_4.1.2       hms_1.1.1          
## [61] knitr_1.36          pillar_1.6.4        rjson_0.2.20       
## [64] boot_1.3-28         reprex_2.0.1        glue_1.5.0         
## [67] evaluate_0.14       mitools_2.4         modelr_0.1.8       
## [70] png_0.1-7           vctrs_0.3.8         nloptr_1.2.2.3     
## [73] tzdb_0.2.0          RgoogleMaps_1.4.5.3 cellranger_1.1.0   
## [76] gtable_0.3.0        assertthat_0.2.1    xfun_0.28          
## [79] broom_0.7.10        e1071_1.7-9         class_7.3-19       
## [82] units_0.7-2         ellipsis_0.3.2

Time elapsed

end.time <- Sys.time()

elapsed.time <- round((end.time - start.time), 3)
print(elapsed.time)
## Time difference of 15.012 mins